import gc
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Ignore all warnings
import warnings
warnings.filterwarnings('ignore')
# List the files available in the working directory.
from subprocess import check_output
directory_listing = check_output(["ls", "./"])
print(directory_listing.decode("utf8"))
# Read in the data: every table is a CSV file in the working directory.
# The generator expressions load the files in the listed order and the
# tuple unpacking binds each frame to its module-level name.
order_products_train, order_products_test, order_products_prior = (
    pd.read_csv(csv_path)
    for csv_path in (
        './order_products__train_cap.csv',
        './order_products__test_cap.csv',
        './order_products__prior.csv',
    )
)
orders, products, aisles, departments = (
    pd.read_csv(csv_path)
    for csv_path in (
        './orders.csv',
        './products.csv',
        './aisles.csv',
        './departments.csv',
    )
)
# Report the (rows, columns) shape of every loaded table.
print("The shape of train data: ", order_products_train.shape)
print("The shape of test data: ", order_products_test.shape)
print("The shape of prior data: ", order_products_prior.shape)
# The original line had no object in front of `.shape` (a syntax error);
# the label shows that the `orders` table was intended.
print("The shape of orders data: ", orders.shape)
print("The shape of products data: ", products.shape)
print("The shape of aisles data: ", aisles.shape)
print("The shape of departments data: ", departments.shape)
# Number of distinct order ids present in the test order lines.
print("Unique orders in the test data set: ", order_products_test.order_id.nunique())
# Notebook-style previews; as bare expressions they display nothing when
# the file is run as a plain script.
order_products_prior.head()
orders.head()
# Attach the aisle and department columns to every product. Both joins are
# left joins, so a product row is kept even if its aisle or department id
# has no match.
products = (
    products
    .merge(aisles, how='left', on='aisle_id')
    .merge(departments, how='left', on='department_id')
)
products.head()
# Merge order_products_prior and orders on order_id (inner join: only
# orders that have prior order lines survive).
order_products_prior_detail = orders.merge(right=order_products_prior, how='inner', on='order_id')
order_products_prior_detail.shape
# Sort the dataframe ascending by user, order number and product so that the
# cumulative count below follows each user's order sequence.
order_products_prior_detail = order_products_prior_detail.sort_values(
    ["user_id", "order_number", "product_id"], ascending=True)
# user_product_time: for each row, how many earlier rows (in the sorted
# order) have the same user and product. cumcount() is zero-based, so the
# first purchase of a product by a user gets 0.
order_products_prior_detail['user_product_time'] = (
    order_products_prior_detail.groupby(['user_id', 'product_id']).cumcount())
order_products_prior_detail.head()
# Use the vectorised Series reductions: the builtin max()/sum() iterate the
# column one element at a time in Python, which is very slow on a large frame.
purchase_counts = order_products_prior_detail.user_product_time
print("The maximum value of the times user buy a porduct is ", purchase_counts.max())
print("The average value of the times user buy a porduct is ", purchase_counts.sum() / len(purchase_counts))
# Per-product purchase statistics over all prior order lines.
#
# The original used nested-dict renaming inside groupby().agg(), which pandas
# removed in 1.0 (it raises SpecificationError there). Each statistic is
# instead computed as its own product_id-indexed Series and assembled into
# one frame, which works on old and new pandas alike and needs no
# column-level flattening afterwards.
by_product = order_products_prior_detail.groupby('product_id')
product_key = order_products_prior_detail['product_id']
purchase_rank = order_products_prior_detail['user_product_time']
# user_product_time is a zero-based cumulative count, so 0 marks a user's
# first purchase of a product, 1 the second and 2 the third. The original
# compared against 1/2/3, which shifted every counter by one purchase.
# The boolean masks are cast to int so the grouped sum is an integer count.
products_new = pd.DataFrame(
    {
        'product_total': by_product['user_id'].count(),
        'product_reorder': by_product['reordered'].sum(),
        'product_first_order': (purchase_rank == 0).astype(int).groupby(product_key).sum(),
        'product_second_order': (purchase_rank == 1).astype(int).groupby(product_key).sum(),
        'product_third_order': (purchase_rank == 2).astype(int).groupby(product_key).sum(),
    },
    columns=[
        'product_total',
        'product_reorder',
        'product_first_order',
        'product_second_order',
        'product_third_order',
    ],
)
products_new.head()
# Turn the product_id index into an ordinary column.
products_new = products_new.rename_axis('product_id').reset_index()
products_new.head()
products_new.shape
# Create some new variables:
# share of first-time buyers who bought the product a second time
products_new['product_reorder_pro'] = products_new.product_second_order / products_new.product_first_order
# share of second-time buyers who bought the product a third time
products_new['product_triorder_pro'] = products_new.product_third_order / products_new.product_second_order
# fraction of all order lines of the product that are flagged as reorders
products_new['product_reorder_ratio'] = products_new.product_reorder / products_new.product_total
# 1 + reorders per first-time buyer
products_new['product_reorder_times'] = 1 + products_new.product_reorder / products_new.product_first_order
products_new.head()
products_new.to_csv("products_new.csv")
order_products_prior_detail.head()
# Per-user features over all prior order lines.
#
# The original relied on nested-dict renaming in groupby().agg() and on the
# DataFrame.ix indexer; pandas removed both in 1.0. Each feature is instead
# computed as a user_id-indexed Series with vectorised group reductions.
by_user = order_products_prior_detail.groupby('user_id')
user_key = order_products_prior_detail['user_id']
# Order-level features: highest order number and the sum / mean of
# days_since_prior_order over the user's order lines (missing values are
# skipped by sum() and mean()).
user_1 = pd.DataFrame(
    {
        'user_total_order': by_user['order_number'].max(),
        'user_days_since_prior_sum': by_user['days_since_prior_order'].sum(),
        'user_days_since_prior_avg': by_user['days_since_prior_order'].mean(),
    },
    columns=['user_total_order', 'user_days_since_prior_sum', 'user_days_since_prior_avg'],
)
user_1.head()
# user_reorder_ratio = (# lines with reordered == 1) / (# lines with
# order_number > 1), per user. The masks are cast to int so the grouped
# sums are integer counts.
reordered_lines = (order_products_prior_detail['reordered'] == 1).astype(int).groupby(user_key).sum()
repeat_order_lines = (order_products_prior_detail['order_number'] > 1).astype(int).groupby(user_key).sum()
user_2 = pd.DataFrame(
    {
        'user_reorder_ratio': reordered_lines / repeat_order_lines,
        'user_total_products': by_user['product_id'].count(),
        'user_distinct_products': by_user['product_id'].nunique(),
    },
    columns=['user_reorder_ratio', 'user_total_products', 'user_distinct_products'],
)
user_2.head()
# Both frames are indexed by user_id, so an index join combines them.
users = user_1.join(user_2, how='inner')
users.head()
# Turn the user_id index into an ordinary column.
users = users.rename_axis('user_id').reset_index()
users.head()
users.shape
# Persist the per-user feature table. The script reads 'user_all.csv' back
# further down, and this is the only statement that produces that file, so
# it has to run (it was commented out in the original).
users.to_csv('user_all.csv')
# Keep the orders whose eval_set is anything other than "prior", restricted
# to the columns that are joined onto the user table.
is_eval_order = orders.eval_set != "prior"
order_select = orders.loc[is_eval_order, ['user_id', 'order_id', 'eval_set', 'days_since_prior_order']]
users = pd.merge(users, order_select, how='inner', on='user_id')
users.head()
users.to_csv("users.csv")
# Replace `users` with the table stored in user_all.csv (a different file
# from the users.csv written just above), then drop the saved index column
# and the user id so that only the remaining columns are left.
users = pd.read_csv("user_all.csv")
users.columns
users = users.drop(['Unnamed: 0', 'user_id'], axis=1)
users.head()
# Use the elbow method to find the optimum number of clusters: fit k-means
# for k = 1..10 and plot the within-cluster sum of squares (inertia)
# against k.
# `plt` was used without ever being imported, which raised NameError.
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
wcss = []
cluster_counts = range(1, 11)
for i in cluster_counts:
    km = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=123)
    km.fit(users)
    # inertia_ is the within-cluster sum of squared distances of this fit
    wcss.append(km.inertia_)
plt.plot(cluster_counts, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('wcss')
plt.show()
# Fit k-means with k=4 on the user table and keep each row's cluster label.
km4 = KMeans(
    n_clusters=4,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)
y_means = km4.fit_predict(users)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Standardise the features, then reduce them to three principal components.
scaler = StandardScaler()
x = scaler.fit_transform(users.values)
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(x)
result = pd.DataFrame(principalComponents, columns=['PCA0', 'PCA1', 'PCA2'])
from plotly.offline import plot
import plotly.graph_objs as go
# Project the standardised features onto three principal components.
pca = PCA(n_components=3).fit(x)
X_reduced = pca.transform(x)
# One marker per row of X_reduced, coloured by its k-means cluster label.
trace1 = go.Scatter3d(
    x=X_reduced[:, 0],
    y=X_reduced[:, 1],
    z=X_reduced[:, 2],
    mode='markers',
    marker=dict(
        size=12,
        color=y_means,
        opacity=1,
    ),
)
# components_ has shape (n_components, n_features), so each row of its
# transpose holds one input feature's weights on the three components.
# Draw a segment from the origin to that point for the first four features
# (the original spelled these four traces out by hand).
loadings = pca.components_.T
loading_traces = [
    go.Scatter3d(
        x=[0, loadings[k][0]],
        y=[0, loadings[k][1]],
        z=[0, loadings[k][2]],
        marker=dict(size=1, color="rgb(84,48,5)"),
        line=dict(color=line_color, width=6),
        name="Var%d" % (k + 1),
    )
    for k, line_color in enumerate(["red", "green", "blue", "yellow"])
]
# Keep the individual trace names available at module level.
dc_1, dc_2, dc_3, dc_4 = loading_traces
data = [trace1, dc_1, dc_2, dc_3, dc_4]
# Axes of 3D traces are configured under layout.scene; the top-level
# layout.xaxis used originally only affects 2D cartesian plots, so the
# 'PC1' title never reached this figure. The deprecated `titlefont` key is
# replaced by the nested title.font form.
axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f',
)
layout = go.Layout(
    scene=dict(
        xaxis=dict(title=dict(text='PC1', font=axis_title_font)),
        yaxis=dict(title=dict(text='PC2', font=axis_title_font)),
        zaxis=dict(title=dict(text='PC3', font=axis_title_font)),
    )
)
fig = go.Figure(data=data, layout=layout)
fig.show()
#plot(fig, filename='3d-scatter-tupac-with-mac')